# load necessary libraries
library(Seurat)
library(dplyr)
library(ggplot2)
# library(CelltypeR)
Read in the flow data. This data should be the gated live cells.
All samples need to be in one folder.
input_path <- "/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/FlowDataFiles/9MBO"
output_path <- "/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/testingLibrary/"
# 1.a Read in FlowJo Files
# choose to downsample to 9000 where possible
# check this - I don't think it worked
flowset <- fsc_to_fs(input_path, downsample = 9000)
# down sample can be a number, 'none' or 'min'
# look at file names and rename with shorter sample names
sampleNames(flowset)
[1] "2020-03-06- export_bioinfo_3450c_live cells.fcs"
[2] "2020-03-06- export_bioinfo_AIW_live cells.fcs"
[3] "2020-03-06- export_bioinfo_AJG_live cells.fcs"
[4] "2020-03-17- export_bioinfo_old 3450c_live cells.fcs"
[5] "2020-03-17- export_bioinfo_old AIW_live cells.fcs"
[6] "2020-03-17- export_bioinfo_old AJG_live cells.fcs"
[7] "2020-03-17- export_bioinfo_young 3450c_live cells.fcs"
[8] "2020-03-17- export_bioinfo_young AIW_live cells.fcs"
[9] "2020-03-17- export_bioinfo_young AJG_live cells.fcs"
sampleNames(flowset) <- sampleNames(flowset) <- c("3450_0306","AIW002_0306","AJG001C_0306","3450_0317A","AIW002_0317A","AJG001C_0317A","3450_0317B","AIW002_0317B","AJG001C_0317B")
sampleNames(flowset)
[1] "3450_0306" "AIW002_0306" "AJG001C_0306" "3450_0317A"
[5] "AIW002_0317A" "AJG001C_0317A" "3450_0317B" "AIW002_0317B"
[9] "AJG001C_0317B"
# if we want to save the flowset object now we can
# this would be read back in with flowset
#
Harmonize data to remove batch or technical variation
This requires us to look and see where there are two peaks to align. We need to visualize the peaks of the transformed data before aligning.
# we can decide what level of processing to choose with the argument 'processing'
# biexp only applies a biexponential transformation
# align applies biexp transform and then aligns peaks
# retro (default), transforms, aligns and then reverse transforms
flowset_biexp <- harmonize(flowset, processing = 'biexp')
# we can visualize the peaks to see where there are two peaks for alignment
# we need to enter the column index for which peaks to align, the alignment for one or two peaks is not the same.
#plotdensity_flowset(flowset)
#plotdensity_flowset(flowset_biexp) # to see the peaks
flowset_align <- harmonize(flowset, processing = 'align',
two_peaks = c(7:20),
one_peak = c(1:6,21), threshold = 0.01)
Adjusting the distance between landmarks
.........
Adjusting the distance between landmarks
.........
flowset_retro <- harmonize(flowset, processing = 'retro',
two_peaks = c(7:20),
one_peak = c(1:6,21), threshold = 0.01)
Adjusting the distance between landmarks
.........
Adjusting the distance between landmarks
.........
df <- flowset_to_csv(flowset_retro)
Now we have made all the different processed versions of the fcs files. We can visualize the intensity in cell density plots to see the alignment.
#plotdensity_flowset(flowset)
plotdensity_flowset(flowset_biexp)
Warning in melt(lapply(as.list(flowset@frames), function(x) { :
The melt generic in data.table has been passed a list and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(lapply(as.list(flowset@frames), function(x) { x = as.data.frame(x@exprs)})). In the next version, this warning will become an error.
No id variables; using all as measure variables
No id variables; using all as measure variables
No id variables; using all as measure variables
No id variables; using all as measure variables
No id variables; using all as measure variables
No id variables; using all as measure variables
No id variables; using all as measure variables
No id variables; using all as measure variables
No id variables; using all as measure variables
Warning in geom_density_ridges(alpha = 0.4, verbose = FALSE) :
Ignoring unknown parameters: `verbose`
Picking joint bandwidth of 0.0468
Picking joint bandwidth of 0.0299
Picking joint bandwidth of 0.0164
Picking joint bandwidth of 0.082
Picking joint bandwidth of 0.0669
Picking joint bandwidth of 0.0172
Picking joint bandwidth of 0.818
Picking joint bandwidth of 0.149
Picking joint bandwidth of 0.722
Picking joint bandwidth of 0.862
Picking joint bandwidth of 0.255
Picking joint bandwidth of 0.24
Picking joint bandwidth of 0.157
Picking joint bandwidth of 0.381
Picking joint bandwidth of 0.614
Picking joint bandwidth of 0.729
Picking joint bandwidth of 0.564
Picking joint bandwidth of 0.241
Picking joint bandwidth of 0.754
Picking joint bandwidth of 0.661
Picking joint bandwidth of 0.102
plotdensity_flowset(flowset_align)
Warning in melt(lapply(as.list(flowset@frames), function(x) { :
The melt generic in data.table has been passed a list and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(lapply(as.list(flowset@frames), function(x) { x = as.data.frame(x@exprs)})). In the next version, this warning will become an error.
No id variables; using all as measure variables
No id variables; using all as measure variables
No id variables; using all as measure variables
No id variables; using all as measure variables
No id variables; using all as measure variables
No id variables; using all as measure variables
No id variables; using all as measure variables
No id variables; using all as measure variables
No id variables; using all as measure variables
Warning in geom_density_ridges(alpha = 0.4, verbose = FALSE) :
Ignoring unknown parameters: `verbose`
Picking joint bandwidth of 0.0468
Picking joint bandwidth of 0.0299
Picking joint bandwidth of 0.0164
Picking joint bandwidth of 0.082
Picking joint bandwidth of 0.0669
Picking joint bandwidth of 0.0172
Picking joint bandwidth of 0.819
Picking joint bandwidth of 0.194
Picking joint bandwidth of 0.724
Picking joint bandwidth of 0.863
Picking joint bandwidth of 0.301
Picking joint bandwidth of 0.273
Picking joint bandwidth of 0.156
Picking joint bandwidth of 0.39
Picking joint bandwidth of 0.616
Picking joint bandwidth of 0.742
Picking joint bandwidth of 0.561
Picking joint bandwidth of 0.241
Picking joint bandwidth of 0.763
Picking joint bandwidth of 0.657
Picking joint bandwidth of 0.102
#plotdensity_flowset(flowset_retro)
Now we will test out clustering. For Seurat clustering and Phenograph we will make the Seurat object; FlowSOM takes in the dataframe directly.
# the function make_seu will take in the df of expression and Antibody/Marker list as a vector and create a seurat object. Values are scaled. Marker expression will be in the "RNA" slot. PCA is calculated using AB vector as the features
# make sure to always keep the same antibody order or your labels will not be correct
# antibody features in order to appear on the plots
AB <- c("CD24","CD56","CD29","CD15","CD184","CD133","CD71","CD44","GLAST","AQP4","HepaCAM", "CD140a","O4")
seu <- make_seu(df, AB_vector = AB)
Warning: Using an external vector in selections was deprecated in tidyselect
1.1.0.
ℹ Please use `all_of()` or `any_of()` instead.
# Was:
data %>% select(AB_vector)
# Now:
data %>% select(all_of(AB_vector))
See
<https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning
was generated.
Centering and scaling data matrix
|
| | 0%
|
|============================================================| 100%
Warning in irlba(A = t(x = object), nv = npcs, ...) :
You're computing too large a percentage of total singular values, use a standard svd instead.
Warning: Requested number is larger than the number of available items (13). Setting to 13.
Warning: Requested number is larger than the number of available items (13). Setting to 13.
Warning: Requested number is larger than the number of available items (13). Setting to 13.
Warning: Requested number is larger than the number of available items (13). Setting to 13.
Warning: Requested number is larger than the number of available items (13). Setting to 13.
PC_ 1
Positive: CD44, CD184, CD24, CD56, CD15, CD133
Negative: CD140a, O4, CD29, CD71, HepaCAM, AQP4
PC_ 2
Positive: CD29, CD44, CD140a, CD184, CD71, O4
Negative: CD15, CD56, CD24, GLAST, CD133, AQP4
PC_ 3
Positive: CD56, CD29, CD133, CD24, AQP4, GLAST
Negative: CD44, CD184, CD140a, O4, HepaCAM, CD15
PC_ 4
Positive: CD133, AQP4, CD184, HepaCAM, CD71, GLAST
Negative: CD56, CD24, CD15, CD44, CD29, CD140a
PC_ 5
Positive: CD184, CD24, CD29, CD140a, O4, HepaCAM
Negative: CD44, CD133, CD56, CD71, AQP4, CD15
Save dataframe and seurat object for later
output_path <- "/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/testingLibrary/"
# to save the df for later
# write.csv(df, "pathway/filename.csv")
#write.csv(df, paste(output_path,"df9000Feb15.csv", sep = ""))
write.csv(df, paste(output_path,"df9000Feb24.csv", sep = ""))
# save the seurat object
saveRDS(seu, paste(output_path,"seu9000Feb24.RDS", sep = ""))
Read in the csv of the flow files processed and the seurat object
df.input <- read.csv("/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/testingLibrary/df9000Feb24.csv")
seu <- readRDS("/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/testingLibrary/seu9000Feb24.RDS")
Test Flowsom
output_path <- "/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/testingLibrary/exp_clusters/flow/"
flow.test <- explore_param(seu, #for phenograph and louvain only
cluster_method = 'flowsom', #take 1 cluster method
df_input= df.input, #needed if input "flowsom"
flow_k = c(3,5,10,20), #k for flowsom
pheno_lou_kn = NULL, #kn for phenograph or louvain
lou_resolution = NULL, #resolution for louvain
run.plot = TRUE, #print if run
run.stats = TRUE, #print and return if run
save_to = NULL)
Building SOM
Mapping data to SOM
Building MST
Computing nearest neighbor graph
Computing SNN
09:43:02 UMAP embedding parameters a = 0.9922 b = 1.112
09:43:02 Read 73578 rows and found 12 numeric columns
09:43:02 Using Annoy for neighbor search, n_neighbors = 30
09:43:02 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
09:43:07 Writing NN index file to temp file /var/folders/k4/khtkczkd5tn732ftjpwgtr240000gn/T//Rtmp2693yU/file1b1d28b18f4f
09:43:07 Searching Annoy index using 1 thread, search_k = 3000
09:43:30 Annoy recall = 100%
09:43:31 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
09:43:34 Initializing from normalized Laplacian + noise (using irlba)
09:43:41 Commencing optimization for 200 epochs, with 3143754 positive edges
Using method 'umap'
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
09:44:27 Optimization finished
See Flowsom stats plots
p1 <- stats_plot(stats_ls = flow.test,
save_to = output_path,
clust_method = "flowsom")
Error in complete.cases(stats_ls) : invalid 'type' (unknown) of argument
Phenograph
output_path <- "/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/testingLibrary/exp_clusters/pheno/"
pheno.test <- explore_param(seu, #for phenograph and louvain only
cluster_method = 'phenograph', #take 1 cluster method
df_input= df.input, #needed if input "flowsom"
flow_k = NULL, #k for flowsom
pheno_lou_kn = c(300,200,100), #kn for phenograph or louvain
lou_resolution = NULL, #resolution for louvain
run.plot = TRUE, #print if run
run.stats = TRUE, #print and return if run
save_to = NULL)
Computing nearest neighbor graph
Computing SNN
13:55:38 UMAP embedding parameters a = 0.9922 b = 1.112
13:55:38 Read 73578 rows and found 12 numeric columns
13:55:38 Using Annoy for neighbor search, n_neighbors = 300
13:55:38 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
13:55:44 Writing NN index file to temp file /var/folders/k4/khtkczkd5tn732ftjpwgtr240000gn/T//Rtmp2693yU/file1b1d18709c3d
13:55:44 Searching Annoy index using 1 thread, search_k = 30000
13:58:25 Annoy recall = 100%
13:58:26 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 300
13:58:54 Initializing from normalized Laplacian + noise (using irlba)
13:59:55 Commencing optimization for 200 epochs, with 15538816 positive edges
Using method 'umap'
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
14:01:19 Optimization finished
Run Rphenograph starts:
-Input data of 73578 rows and 13 columns
-k is set to 300
Finding nearest neighbors...DONE ~ 26.478 s
Compute jaccard coefficient between nearest-neighbor sets...DONE ~ 1238.504 s
Build undirected graph from the weighted links...DONE ~ 114.634 s
Run louvain clustering on the graph ...DONE ~ 115.281 s
Run Rphenograph DONE, totally takes 1494.89699999988s.
Return a community class
-Modularity value: 0.8205017
-Number of clusters: 18
Computing nearest neighbor graph
Computing SNN
14:29:01 UMAP embedding parameters a = 0.9922 b = 1.112
14:29:01 Read 73578 rows and found 12 numeric columns
14:29:01 Using Annoy for neighbor search, n_neighbors = 200
14:29:01 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
14:29:06 Writing NN index file to temp file /var/folders/k4/khtkczkd5tn732ftjpwgtr240000gn/T//Rtmp2693yU/file1b1d12050077
14:29:06 Searching Annoy index using 1 thread, search_k = 20000
14:31:02 Annoy recall = 100%
14:31:05 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 200
14:31:26 Initializing from normalized Laplacian + noise (using irlba)
14:32:09 Commencing optimization for 200 epochs, with 13904824 positive edges
Using method 'umap'
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
14:33:29 Optimization finished
Run Rphenograph starts:
-Input data of 73578 rows and 13 columns
-k is set to 200
Finding nearest neighbors...DONE ~ 21.326 s
Compute jaccard coefficient between nearest-neighbor sets...DONE ~ 604.848 s
Build undirected graph from the weighted links...DONE ~ 75.626 s
Run louvain clustering on the graph ...DONE ~ 93.603 s
Run Rphenograph DONE, totally takes 795.402999999933s.
Return a community class
-Modularity value: 0.8319901
-Number of clusters: 21
Computing nearest neighbor graph
Computing SNN
14:48:26 UMAP embedding parameters a = 0.9922 b = 1.112
14:48:26 Read 73578 rows and found 12 numeric columns
14:48:26 Using Annoy for neighbor search, n_neighbors = 100
14:48:26 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
14:48:32 Writing NN index file to temp file /var/folders/k4/khtkczkd5tn732ftjpwgtr240000gn/T//Rtmp2693yU/file1b1d6618ddf
14:48:32 Searching Annoy index using 1 thread, search_k = 10000
14:49:35 Annoy recall = 100%
14:49:38 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 100
14:49:48 Initializing from normalized Laplacian + noise (using irlba)
14:50:09 Commencing optimization for 200 epochs, with 9527954 positive edges
Using method 'umap'
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
14:51:18 Optimization finished
Run Rphenograph starts:
-Input data of 73578 rows and 13 columns
-k is set to 100
Finding nearest neighbors...DONE ~ 15.125 s
Compute jaccard coefficient between nearest-neighbor sets...DONE ~ 153.399 s
Build undirected graph from the weighted links...DONE ~ 37.717 s
Run louvain clustering on the graph ...DONE ~ 38.702 s
Run Rphenograph DONE, totally takes 244.942999999854s.
Return a community class
-Modularity value: 0.8479152
-Number of clusters: 25
# Create cell type clusters functions
# explore_param (Shuming)
#Intrinsic stats (Shuming)
# # clust_stability (Shuming)
# Libraries required by the clustering and statistics functions below.
# (ggplot2 was previously loaded twice in this block.)
library(clusterSim)  # index.DB (Davies-Bouldin index)
library(FlowSOM)     # SOM clustering of flow data
library(flowCore)    # flowFrame class
library(cluster)     # silhouette score, daisy
library(fpc)         # calinhara (Calinski-Harabasz index)
library(Seurat)      # Seurat clustering / plotting
library(dplyr)
library(ggplot2)     # plotting (also used by the stats plot function)
library(clustree)    # clustree plot
library(Rphenograph) # Phenograph clustering
library(flexclust)   # adjusted Rand index
####################################################################################################
# explore_param
# reads in csv files with flow cytometry experiments or seurat data object depending on arguments
# runs FlowSom, Phenograph, Louvain (Seurat)
# input arguments - cluster method, for flowsom k, for phenograph and louvain k paramater, for louvain resolution
# select outputs, generates - UMAPS, heatmaps, clustree : save to folder or put in global environ
# creates list object to run stats
#JAN2923EDIT: only one clustering method can be run at a time;
#if a graph cannot be run, an error message is reported and it is skipped
# input is a seurat object
# explore_param
# Runs exactly one clustering method (FlowSOM, Phenograph, or Louvain/Seurat)
# over a range of parameters, optionally printing plots and returning stats.
#
# input          Seurat object (used by all methods)
# cluster_method one of "louvain", "phenograph", "flowsom"
# df_input       processed expression data frame (needed for "flowsom" and for
#                the Davies-Bouldin index in all methods)
# flow_k         vector of k values for FlowSOM meta-clustering
# pheno_lou_kn   vector of kn values for Phenograph or Louvain
# lou_resolution vector of resolutions for Louvain
# run.plot       print (and save when save_to is set) plots for each run
# run.stats      compute, print and return intrinsic statistics
# save_to        output folder; needed to save plots or stats to disk
#
# Returns list(cluster = <helper result>) when run.plot or run.stats is TRUE.
explore_param <- function(input, #for phenograph and louvain only
                          cluster_method, #take 1 cluster method
                          df_input, #needed if input "flowsom"
                          flow_k = NULL, #k for flowsom
                          pheno_lou_kn = NULL, #kn for phenograph or louvain
                          lou_resolution = NULL, #resolution for louvain
                          run.plot = FALSE, #print if run
                          run.stats = TRUE, #print and return if run
                          save_to = NULL) { #need it if save plots or stats
  # Fail fast on an unknown method; previously `cl` was left undefined and the
  # final return() errored with an unhelpful "object 'cl' not found".
  cluster_method <- match.arg(cluster_method,
                              choices = c("louvain", "phenograph", "flowsom"))
  # dispatch to the helper for the requested method
  cl <- switch(cluster_method,
    louvain = louvain(input = input, #seu object
                      df_input = df_input,
                      pheno_lou_kn = pheno_lou_kn,
                      resolutions = lou_resolution,
                      run.plot = run.plot,
                      run.stats = run.stats,
                      save_to = save_to),
    phenograph = phenograph(input = input,
                            df_input = df_input,
                            pheno_lou_kn = pheno_lou_kn,
                            run.stats = run.stats,
                            run.plot = run.plot,
                            save_to = save_to),
    flowsom = flowsom(input = input,
                      df_input = df_input,
                      flow_k = flow_k,
                      run.stats = run.stats,
                      run.plot = run.plot,
                      save_to = save_to))
  if (run.plot || run.stats) {
    return(list(cluster = cl))
  }
}
# function #1:
# flowsom
# Runs FlowSOM clustering on flow cytometry data for each k in flow_k, adds
# cluster IDs to the Seurat object for visualization, and optionally computes
# intrinsic cluster statistics.
#
# input     Seurat object (expression read from the "counts" slot)
# df_input  processed data frame (pre-Seurat); non-numeric columns (e.g. the
#           X index column from read.csv) are dropped
# flow_k    vector of k values for FlowSOM meta-clustering
# run.stats compute/print/return silhouette, Calinski-Harabasz and
#           Davies-Bouldin statistics
# run.plot  print (and save when save_to is set) UMAP/heatmap/clustree plots
# save_to   output folder; when NULL nothing is written to disk
#
# Returns the stats data frame whenever run.stats is TRUE. (Previously stats
# were only returned when save_to was also set, so callers using
# save_to = NULL got NULL back and stats_plot() failed downstream.)
flowsom <- function(input, #seurat
                    df_input, #the processed df2 file before being converted to seurat
                    flow_k,
                    run.stats = TRUE,
                    run.plot = TRUE,
                    save_to = NULL) {
  clust_method <- "flowsom" #for naming the graphs
  # create the flowFrame from the numeric columns only
  frame <- new("flowFrame", exprs = as.matrix(df_input[, sapply(df_input, is.numeric)]))
  fs <- ReadInput(frame) #convert flowframe to flowsom object
  fs <- BuildSOM(fs) # build flowSOM object, no need for -1 because X column is removed
  fs <- BuildMST(fs) # build minimum spanning tree
  # subsample for the silhouette score (n = 9000, or all cells if fewer);
  # the full distance matrix would be too expensive
  if (run.stats) {
    m <- t(as.matrix(GetAssayData(object = input, slot = "counts")))
    row_n <- sample(1:nrow(m), ifelse(nrow(m) > 9000, 9000, nrow(m)))
    dis <- daisy(m[row_n, ]) #dissimilarities distance
  }
  stats_ls <- vector() # accumulates one row of stats per k
  input <- FindNeighbors(input, dims = 1:12)
  input <- RunUMAP(input, dims = 1:12)
  for (i in flow_k) {
    flowSOMcluster <- metaClustering_consensus(fs$map$codes, k = i, seed = 42)
    clust_name <- paste('FlowSom.k.', i, sep = "")
    # add the cluster IDs into the Seurat object to visualize
    input <- AddMetaData(object = input,
                         metadata = flowSOMcluster[fs$map$mapping[, 1]],
                         col.name = clust_name) #clustering name
    number.clusters <- length(unique(flowSOMcluster[fs$map$mapping[, 1]]))
    # UMAP and heatmap for this k
    if (run.plot) {
      p3 <- DimPlot(input, reduction = "umap", repel = TRUE, label = TRUE,
                    group.by = clust_name) # will automatically group by active ident
      p4 <- DoHeatmap(input, features = rownames(input), group.by = clust_name) # heatmap
      print(p3)
      print(p4)
      if (!is.null(save_to)) {
        png(filename = paste(save_to, clust_method, "UMAPclusters_k", i, ".png", sep = ""), width = 2000, height = 1200, res = 300)
        print(p3)
        dev.off()
        png(filename = paste(save_to, clust_method, "_Heatmap_clusters_k", i, ".png", sep = ""), width = 5000, height = 3000, res = 300)
        print(p4)
        dev.off()
      }
      suppressWarnings(rm(p3, p4))
    }
    if (run.stats) {
      stats_ls <- c(
        stats_ls,
        i, # krange: the requested k (stats_plot() expects this column)
        number.clusters, #number of clusters
        mean(silhouette(flowSOMcluster[fs$map$mapping[, 1]][row_n], dis)[, 3]), #silhouette score
        calinhara(m, flowSOMcluster[fs$map$mapping[, 1]], cn = i), #Calinski-Harabasz index
        index.DB(x = df_input[, sapply(df_input, is.numeric)],
                 cl = as.numeric(flowSOMcluster[fs$map$mapping[, 1]]))$DB) # Davies-Bouldin index
    }
  }
  if (run.stats) {
    stats_df <- as.data.frame(matrix(stats_ls, ncol = 5, byrow = TRUE))
    # column names match what stats_plot() reads for clust_method "flowsom"
    colnames(stats_df) <- c("krange", "number.cluster", "silhouette.score",
                            "calinski.harabasz", "db")
    print(stats_df)
    if (!is.null(save_to)) { # previously wrote to the working dir when save_to was NULL
      write.csv(stats_df, paste(save_to, clust_method, '_stats.csv', sep = ""), row.names = F)
    }
  }
  # clustree needs at least 3 clusterings to be informative
  if (run.plot && length(flow_k) > 2) {
    p5 <- clustree(input, prefix = 'FlowSom.k.')
    print(p5)
    if (!is.null(save_to)) {
      png(filename = paste(save_to, clust_method, 'Clustree.png', sep = ""), width = 1200, height = 2000, res = 300)
      print(p5)
      dev.off()
    }
  }
  if (run.stats) {
    return(stats_df)
  }
}
#helper functions #2:
# phenograph
# Runs Rphenograph clustering for each kn in pheno_lou_kn, adds cluster IDs to
# the Seurat object, and optionally prints/saves plots and computes stats.
#
# input        Seurat object (counts slot is the clustering input)
# df_input     processed data frame (for the Davies-Bouldin index)
# pheno_lou_kn vector of kn (nearest-neighbor) values
# run.stats    compute/print/return intrinsic statistics
# run.plot     print (and save when save_to is set) plots
# save_to      output folder; when NULL nothing is written to disk
#
# Returns the stats data frame whenever run.stats is TRUE (previously only
# when save_to was also set).
phenograph <- function(input,
                       df_input,
                       pheno_lou_kn,
                       run.stats = TRUE,
                       run.plot = TRUE,
                       save_to = NULL) {
  clust_method <- "phenograph"
  # m is needed for the clustering itself, not just the stats: previously it
  # was created inside `if (run.stats)` so run.stats = FALSE crashed Rphenograph.
  m <- t(as.matrix(GetAssayData(object = input, slot = "counts")))
  if (run.stats) {
    # subsample for the silhouette score (n = 9000, or all cells if fewer)
    row_n <- sample(1:nrow(m), ifelse(nrow(m) > 9000, 9000, nrow(m)))
    dis <- daisy(m[row_n, ]) #dissimilarities distance
  }
  stats_ls <- vector() #stats list returned
  for (i in pheno_lou_kn) {
    input <- FindNeighbors(input, dims = 1:12, k.param = i)
    input <- RunUMAP(input, dims = 1:12, n.neighbors = i)
    ### run phenograph clustering
    Rphenograph_out_flow <- Rphenograph(m, k = i)
    phenocluster <- factor(membership(Rphenograph_out_flow[[2]]))
    clust_name <- paste('Pheno.kn.', i, sep = "")
    # add the cluster ID into seurat object to visualize
    input <- AddMetaData(object = input, phenocluster, col.name = clust_name)
    number.clusters <- length(unique(phenocluster))
    if (run.plot) {
      p4 <- DoHeatmap(input, features = rownames(input), group.by = clust_name) # heatmap
      p3 <- DimPlot(input, reduction = "umap", repel = TRUE, label = TRUE, group.by = clust_name) # UMAP
      print(p3)
      print(p4)
      if (!is.null(save_to)) {
        png(filename = paste(save_to, clust_method, "_UMAPclusters_kn", i, ".png", sep = ""), width = 2000, height = 1200, res = 300)
        print(p3)
        dev.off()
      }
      suppressWarnings(rm(p3))
    }
    # previously unguarded: referenced `dis`/`row_n`, which only exist when
    # run.stats is TRUE
    if (run.stats) {
      stats_ls <- c(stats_ls,
                    i, #kn
                    number.clusters, #number of clusters
                    mean(silhouette(as.numeric(phenocluster[row_n]), dis)[, 3]), #silhouette score
                    calinhara(m, phenocluster, cn = i), #Calinski-Harabasz index
                    index.DB(x = df_input[, sapply(df_input, is.numeric)], cl = as.numeric(phenocluster))$DB) # Davies-Bouldin index
    }
  }
  # clustree needs at least 3 clusterings to be informative
  if (run.plot && (length(pheno_lou_kn) > 2)) {
    p4 <- clustree(input, prefix = 'Pheno.kn.')
    print(p4)
    if (!is.null(save_to)) {
      png(filename = paste(save_to, clust_method, 'Clustree.png', sep = ""), width = 1000, height = 2000, res = 300)
      print(p4) # reuse the already-built plot instead of recomputing clustree
      dev.off()
    }
  }
  if (run.stats) { # build and (optionally) save the stats
    stats_df <- as.data.frame(matrix(stats_ls, ncol = 5, byrow = TRUE))
    colnames(stats_df) <- c("kn", "number.cluster", "silhouette.score", "calinski.harabasz", "davies.bouldin")
    if (!is.null(save_to)) { # previously wrote to the working dir when save_to was NULL
      write.csv(stats_df, paste(save_to, clust_method, '_stats.csv', sep = ""), row.names = F)
    }
    return(stats_df)
  }
}
#function #3 - seurat louvain clustering:
# louvain
# Runs Seurat Louvain clustering over a grid of kn (neighbors) x resolution
# values, optionally printing/saving plots and computing intrinsic statistics.
#
# input        Seurat object
# df_input     processed data frame (for the Davies-Bouldin index)
# pheno_lou_kn vector of kn values for FindNeighbors/RunUMAP
# resolutions  vector of resolutions for FindClusters
# run.plot     print (and save when save_to is set) plots
# run.stats    compute/print/return statistics
# save_to      output folder; when NULL nothing is written to disk
#
# Returns the stats data frame whenever run.stats is TRUE (previously only
# when save_to was also set).
louvain <- function(input, #seu object
                    df_input,
                    pheno_lou_kn,
                    resolutions,
                    run.plot = FALSE, #option to save the graphs
                    run.stats = TRUE, #option to save stats list
                    save_to = NULL #only required when save is TRUE
                    ) {
  clust_method <- "louvain"
  if (run.stats) {
    # subsample for the silhouette score (n = 9000, or all cells if fewer)
    m <- t(as.matrix(GetAssayData(object = input, slot = "counts")))
    row_n <- sample(1:nrow(m), ifelse(nrow(m) > 9000, 9000, nrow(m)))
    dis <- daisy(m[row_n, ]) #dissimilarities distance
    statsl <- vector() # stats accumulator: one row per kn x resolution
  }
  # the clusters in the Seurat object are overwritten with each new kn
  for (i in pheno_lou_kn) {
    input <- FindNeighbors(input, dims = 1:12, k.param = i)
    input <- RunUMAP(input, dims = 1:12, n.neighbors = i)
    # feature plots of this UMAP
    if (run.plot) {
      p1 <- FeaturePlot(input,
                        features = rownames(input),
                        slot = 'scale.data',
                        min.cutoff = 'q1',
                        max.cutoff = 'q99', # was '99' (absolute); quantile intended to pair with 'q1'
                        label.size = 1) +
        theme(plot.title = element_text(size = 0.1))
      p2 <- DimPlot(input, group.by = 'Sample', label.size = 1)
      print(p1)
      print(p2)
      if (!is.null(save_to)) {
        # png() device: files now carry a .png extension (previously named .pdf)
        png(filename = paste(save_to, clust_method, "UMAPfeatures_kn", i, ".png", sep = ""), width = 2000, height = 1200, res = 300)
        print(p1)
        dev.off()
        # look at batches
        png(filename = paste(save_to, clust_method, "UMAP_Sample_kn", i, ".png", sep = ""), width = 2000, height = 1200, res = 300)
        print(p2)
        dev.off()
      }
      suppressWarnings(rm(p1, p2))
    }
    for (j in resolutions) {
      input <- FindClusters(input, resolution = j)
      louvainCluster <- input@meta.data$seurat_clusters
      # stats
      if (run.stats) {
        statsl <- c(statsl,
                    i, # kn
                    j, # resolution
                    length(unique(louvainCluster))) # number of clusters
        if (length(unique(louvainCluster)) == 1) { # skip the ones with only 1 cluster
          statsl <- c(statsl, rep(NA, 3))
        } else {
          statsl <- c(statsl,
                      mean(silhouette(as.numeric(louvainCluster[row_n]), dis)[, 3]), # silhouette score
                      calinhara(m, louvainCluster, cn = i), # Calinski-Harabasz index
                      index.DB(x = df_input[, sapply(df_input, is.numeric)], cl = as.numeric(louvainCluster))$DB) # Davies-Bouldin index
        }
      }
      if (run.plot) { # UMAP grouped plots and heatmap
        p3 <- DimPlot(input, reduction = "umap", repel = TRUE, label = TRUE)
        p4 <- DoHeatmap(input, features = rownames(input), size = 10) +
          theme(text = element_text(size = 30)) # heatmap
        print(p3)
        print(p4)
        if (!is.null(save_to)) {
          png(filename = paste(save_to, clust_method, "UMAPclusters_kn",
                               i, "_res_", j, ".png", sep = ""),
              width = 2000, height = 1200, res = 300)
          print(p3)
          dev.off()
          png(filename = paste(save_to, clust_method, "Heatmapclusters_kn",
                               i, "_res_", j, ".png", sep = ""),
              width = 2000, height = 1200, res = 300)
          print(p4)
          dev.off()
        }
        suppressWarnings(rm(p3, p4))
      }
    }
    # outside of resolution loop, in kn loop now:
    if (run.plot && (length(resolutions) > 1)) {
      # run clustree across all resolutions for this kn
      p5 <- clustree(input, prefix = 'RNA_snn_res.')
      print(p5)
      if (!is.null(save_to)) {
        # clustree aggregates across resolutions, so only kn goes in the name
        # (previously the filename used `j` leaked from the finished inner loop)
        png(filename = paste(save_to, clust_method, "kn", i, "Clustree.png", sep = ""),
            width = 1200, height = 2000, res = 300)
        print(p5)
        dev.off()
      }
      suppressWarnings(rm(p5))
    }
  }
  # outside of kn loop
  if (run.stats) { # build and (optionally) save the stats
    stats_df <- as.data.frame(matrix(statsl, ncol = 6, byrow = TRUE))
    colnames(stats_df) <- c("kn", "resolution", "number.cluster", "silhouette.score", "calinski.harabasz", "davies.bouldin")
    if (!is.null(save_to)) { # previously wrote to the working dir when save_to was NULL
      write.csv(stats_df, paste(save_to, clust_method, '_stats.csv', sep = ""), row.names = F)
    }
    return(stats_df)
  }
}
####################################################################################################
#Intrinsic stats
# takes in a dataframe or list of statistics
# plots intrinsic statistics from pararmater explorations
# c("kn", "resolution", "number.cluster", "silhouette.score", "calinski.harabasz", "davies.bouldin")
stats_plot <- function(stats_ls,
save_to,
clust_method) {
# drop any rows containing NAs, they contain NAs because some kn x res
#give number of clusters of 1 (there is no split), and you can't run
#internal stats on them
stats_ls <- stats_ls[complete.cases(stats_ls), ]
if (clust_method == "louvain") {
##silhouette score: ranges from -1 to 1
##-1: bad clusters 0: neutral, indifferent 1: good clusters
#x axis = number of cluster
siplot1 <- ggplot(stats_ls, aes(x = number.cluster, y = silhouette.score, label = resolution)) +
geom_line(aes(group = kn, color = factor(kn)), size = 0.15) +
geom_text(aes(label = resolution, colour = factor(kn)),
check_overlap = TRUE,
position = position_jitter(width = 0.2),
size = 3) +
labs(color = "kn", title = "Silhouette Scores",
x = "Number of Clusters", y = "Average Silhouette Scores") +
theme(plot.title = element_text(hjust = 0.5))
#x axis = kn
siplot2 <- ggplot(stats_ls, aes(kn, silhouette.score)) +
geom_point(aes(colour = factor(resolution), group = (resolution))) +
geom_line(aes(colour = factor(resolution), group = (resolution)), size = 0.2) +
labs(title = "Silhouette Scores",
x = "kn",
y = "Average Silhouette Scores",
colour = 'Resolution') +
theme(plot.title = element_text(hjust = 0.5))
##Calinski-Harabasz index:
## the highest value is the optimal number of clusters
#x axis = number of cluster
chplot1 <- ggplot(stats_ls, aes(x = number.cluster, y = calinski.harabasz, label = resolution)) +
geom_line(aes(group = kn,color = factor(kn)), size = 0.15) +
geom_text(aes(label = resolution, colour = factor(kn)),
check_overlap = TRUE,
position = position_jitter(width = 0.2),
size = 3) +
labs(color = "kn",
title = "Calinski-Harabasz Index",
x = "Number of Clusters",
y = "Calinski-Harabasz Index") +
theme(plot.title = element_text(hjust = 0.5))
#x axis = kn
chplot2 <- ggplot(stats_ls, aes(kn, calinski.harabasz)) +
geom_point(aes(colour = factor(resolution), group = factor(resolution))) +
geom_line(aes(colour = factor(resolution), group = factor(resolution)), size = 0.2) +
labs(title = "Calinski-Harabasz Index",
x = "kn",
y = "Calinski-Harabasz Index",
colour = 'Resolution') +
theme(plot.title = element_text(hjust = 0.5))
## #Davies–Bouldin index: minimum score is zero
## #the lowest value is the optimal number of clusters
#x axis = number of cluster
dbplot1 <- ggplot(stats_ls,
aes(x = number.cluster, y = db, label = resolution)) +
geom_line(aes(group = kn,color = factor(kn)), size = 0.15) +
geom_text(aes(label = resolution, colour = factor(kn)),
check_overlap = TRUE,
position = position_jitter(width = 0.2), size = 3) +
labs(color = "kn", title = "Davies-Bouldin index",
x = "Number of Clusters", y = "Davies-Bouldin index") +
theme(plot.title = element_text(hjust = 0.5))
#x axis = kn
dbplot2 <- ggplot(stats_ls, aes(kn, db)) +
geom_point(aes(colour = factor(resolution), group = factor(resolution))) +
geom_line(aes(colour = factor(resolution), group = factor(resolution)),
size = 0.2) +
labs(title = "Davies-Bouldin index",
x = "kn",
y = "Davies-Bouldin index",
colour = 'Resolution') +
theme(plot.title = element_text(hjust = 0.5))
} else if (clust_method == "flowsom") {
siplot1 <- ggplot(stats_ls) +
geom_point(aes(x = krange, y = silhouette.score)) +
geom_line(aes(x = krange, y = silhouette.score), size = 0.1) +
labs(title = "Silhouette Scores",
x = "krange", y = "Average Silhouette Scores") +
theme(plot.title = element_text(hjust = 0.5))
siplot2 <- ggplot(stats_ls) +
geom_point(aes(x = number.cluster, y = silhouette.score)) +
geom_line(aes(x = number.cluster, y = silhouette.score), size = 0.1) +
labs(title = "Silhouette Scores",
x = "Number of Clusters", y = "Average Silhouette Scores") +
theme(plot.title = element_text(hjust = 0.5))
chplot1 <- ggplot(stats_ls) +
geom_point(aes(x = krange, y = calinski.harabasz)) +
geom_line(aes(x = krange, y = calinski.harabasz), size = 0.1) +
labs(title = "Calinski-Harabasz Index",
x = "krange", y = "Calinski-Harabasz Index") +
theme(plot.title = element_text(hjust = 0.5))
chplot2 <- ggplot(stats_ls) +
geom_point(aes(x = number.cluster, y = calinski.harabasz)) +
geom_line(aes(x = number.cluster, y = calinski.harabasz), size = 0.1) +
labs(title = "Calinski-Harabasz Index",
x = "Number of Clusters", y = "Calinski-Harabasz Index") +
theme(plot.title = element_text(hjust = 0.5))
dbplot1 <- ggplot(stats_ls) +
geom_point(aes(x = krange, y = db)) +
geom_line(aes(x = krange, y = db), size = 0.1) +
labs(title = "Davies-Bouldin index",
x = "krange", y = "Davies-Bouldin index") +
theme(plot.title = element_text(hjust = 0.5))
dbplot2 <- ggplot(stats_ls) +
geom_point(aes(x = number.cluster, y = db)) +
geom_line(aes(x = number.cluster, y = db), size = 0.1) +
labs(title = "Davies-Bouldin index",
x = "Number of Clusters", y = "Davies-Bouldin index") +
theme(plot.title = element_text(hjust = 0.5))
} else if (clust_method == "phenograph") {
siplot1 <- ggplot(stats_ls) +
geom_point(aes(x = kn, y = silhouette.score)) +
geom_line(aes(x = kn, y = silhouette.score), size = 0.1) +
labs(title = "Silhouette Scores",
x = "kn", y = "Average Silhouette Scores") +
theme(plot.title = element_text(hjust = 0.5))
siplot2 <- ggplot(stats_ls) +
geom_point(aes(x = number.cluster, y = silhouette.score)) +
geom_line(aes(x = number.cluster, y = silhouette.score), size = 0.1) +
labs(title = "Silhouette Scores",
x = "Number of Clusters", y = "Average Silhouette Scores") +
theme(plot.title = element_text(hjust = 0.5))
chplot1 <- ggplot(stats_ls) +
geom_point(aes(x = kn, y = calinski.harabasz)) +
geom_line(aes(x = kn, y = calinski.harabasz), size = 0.1) +
labs(title = "Calinski-Harabasz Index",
x = "kn", y = "Calinski-Harabasz Index") +
theme(plot.title = element_text(hjust = 0.5))
chplot2 <- ggplot(stats_ls) +
geom_point(aes(x = number.cluster, y = calinski.harabasz)) +
geom_line(aes(x = number.cluster, y = calinski.harabasz), size = 0.1) +
labs(title = "Calinski-Harabasz Index",
x = "Number of Clusters", y = "Calinski-Harabasz Index") +
theme(plot.title = element_text(hjust = 0.5))
dbplot1 <- ggplot(stats_ls) +
geom_point(aes(x = kn, y = db)) +
geom_line(aes(x = kn, y = db), size = 0.1) +
labs(title = "Davies-Bouldin index",
x = "kn", y = "Davies-Bouldin index") +
theme(plot.title = element_text(hjust = 0.5))
dbplot2 <- ggplot(stats_ls) +
geom_point(aes(x = number.cluster, y = db)) +
geom_line(aes(x = number.cluster, y = db), size = 0.1) +
labs(title = "Davies-Bouldin index",
x = "Number of Clusters", y = "Davies-Bouldin index") +
theme(plot.title = element_text(hjust = 0.5))
} else (
warning("clust_method is incorrect. ")
)
pdf(paste(save_to, clust_method, "_stats_plot.pdf", sep = ""))
print(siplot1)
print(siplot2)
print(chplot1)
print(chplot2)
print(dbplot1)
print(dbplot2)
dev.off()
return(list(siplot1, siplot2, chplot1, chplot2, dbplot1, dbplot2))
}
####################################################################################################
# clust_stability
# select cluster method and one parameter to vary (resolutions 0.1,0.3,0.5,0.8)
# runs n iterations randomizing starting point - default 100
# calculates the RandIndex (RI) between each iteration of clustering for each resolution
# calculates the mean and standard deviation of the number of clusters and the RI for each resolution
# outputs a table and plot of the results
# Rand_index
# Assess cluster stability for a Seurat object: for each kn and each
# resolution, Louvain clustering is repeated n times with different random
# seeds, then the Rand index between every pair of runs and the number of
# clusters per run are summarised (mean / median / sd).
#
# input       Seurat object (scaled data and reduction inputs prepared upstream)
# resolutions numeric vector of Louvain resolutions to test
# kn          integer vector of k.param values for FindNeighbors / RunUMAP
# n           number of clustering repeats per (kn, resolution) pair
# save_to     output path prefix; if NULL the intermediate lists are not saved
#
# Returns list(list = summary data frame, figure = ggplot from plot_randindex)
Rand_index <- function(input,
                       resolutions,
                       kn,
                       n = 100, #number of iterations
                       save_to = NULL #if null, will not save seul, ril, ncl
                       ) {
  # one random seed per repeat
  rn_ls <- round(runif(n, min = 0, max = 100000), 0)
  # summary data frame: mean/median/sd of rand index and cluster number
  rdf <- data.frame(matrix(ncol = 8, nrow = 0))
  colnames(rdf) <- c('kn', 'resolution',
                     'meanri', 'medianri', 'sdri',
                     'meannc', 'mediannc', 'sdnc')
  # helper: cluster once with the x-th random seed at resolution j
  # (j, input and rn_ls are read from the enclosing environment)
  hp1 <- function(x) {
    input <- FindClusters(input, random.seed = rn_ls[x], resolution = j)
    return(Idents(object = input))
  }
  # helper: Rand index between two repeats (two columns of seul),
  # for every unordered pair of distinct repeats
  hp2 <- function(x) {
    ri <- randIndex(table(as.numeric(seul[, x[1]]),
                          as.numeric(seul[, x[2]])))
    return(ri)
  }
  # helper: number of clusters in one repeat
  hp3 <- function(x) { length(unique(x)) }
  # main loops over kn and resolution
  for (i in kn) {
    # NOTE(review): dims = 1:12 is hard-coded -- presumably the 12 markers;
    # confirm this matches the dimensionality of the stored reduction
    input <- FindNeighbors(input, dims = 1:12, k.param = i)
    input <- RunUMAP(input, dims = 1:12, n.neighbors = i)
    for (j in resolutions) {
      seul <- sapply(seq_len(n), hp1)  # n repeats of cluster assignments
      ncl <- apply(seul, 2, hp3)       # number of clusters per repeat
      ril <- apply(t(combn(seq_len(n), 2)), 1, hp2) # rand index per pair
      if (!is.null(save_to)) {
        saveRDS(seul, paste0(save_to, "seu_ls_kn", i, "_j", j, ".Rds"))
        saveRDS(ncl, paste0(save_to, "nc_ls_kn", i, "_j", j, ".Rds"))
        # fixed: previously saved ncl a second time, so the "ri_ls" file
        # contained cluster counts instead of Rand indices
        saveRDS(ril, paste0(save_to, "ri_ls_kn", i, "_j", j, ".Rds"))
      }
      rdf <- rbind(rdf, list(
        kn = i, resolution = j,
        meanri = mean(ril), medianri = median(ril), sdri = sd(ril),
        meannc = mean(ncl), mediannc = median(ncl), sdnc = sd(ncl)))
    }
  }
  p <- plot_randindex(rdf, c('pink', 'violet'), c(0.7, 1))
  return(list(list = rdf, figure = p))
}
# Plot the RAND INDEX summary (mean RI and mean number of clusters vs resolution)
# Plot mean Rand index (left axis) and mean number of clusters (right axis)
# against clustering resolution, with sd error bars for both series.
#
# rdf  data frame with columns resolution, meanri, sdri, meannc, sdnc
# cp   two colours: cp[1] for the Rand index, cp[2] for the cluster counts
# view y-axis window for the Rand index; cluster counts are linearly rescaled
#      into the same window (viewing range only, the data are unchanged)
plot_randindex <- function (
  rdf,
  cp = c("orange", "violet"),
  view = c(0, 1) #zoom in x axis, this does not changes scales, just the viewable sections
) {
  # scale factor mapping cluster counts (incl. error bars) into the view window
  sf <- (view[2] - view[1]) / max(rdf$meannc + rdf$sdnc)
  fig <- ggplot(rdf, aes(x = resolution)) +
    # mean Rand index with sd error bars, in cp[1]
    geom_line(aes(y = meanri), color = cp[1]) +
    geom_point(aes(y = meanri), color = cp[1], size = 1) +
    geom_errorbar(aes(ymin = meanri - sdri, ymax = meanri + sdri),
                  color = cp[1], width = .01,
                  position = position_dodge(.9)) +
    # mean number of clusters, rescaled into the view window, in cp[2]
    geom_line(aes(y = meannc * sf + view[1]), color = cp[2]) +
    geom_point(data = rdf,
               mapping = aes(x = resolution, y = meannc * sf + view[1]),
               color = cp[2]) +
    geom_errorbar(rdf,
                  mapping = aes(x = resolution,
                                ymin = (meannc - sdnc) * sf + view[1],
                                ymax = (meannc + sdnc) * sf + view[1]),
                  width = .01, position = position_dodge(.9), color = cp[2]) +
    # secondary axis inverts the rescaling so it reads in cluster counts
    scale_y_continuous(limits = view, name = "Mean Rand Index",
                       sec.axis = sec_axis(~ . / sf - view[1] / sf,
                                           name = "Mean Number of Clusters")) +
    theme(axis.text.y = element_text(color = cp[1]),
          axis.title.y = element_text(color = cp[1]),
          axis.text.y.right = element_text(color = cp[2]),
          axis.title.y.right = element_text(color = cp[2]),
          plot.title = element_text(hjust = 0.5, size = 10)) +
    ggtitle("Plot of Mean Rand Index and \n Mean Number of Clusters")
  fig
}
#helper function:
#compare internal stats:
# Compare internal cluster-quality statistics between a Louvain parameter
# sweep and a Phenograph parameter sweep, one line plot per metric
# (number of clusters, silhouette, Calinski-Harabasz, Davies-Bouldin).
#
# lou.test   data frame from the Louvain sweep (columns kn, resolution,
#            number.cluster, silhouette.score, calinski.harabasz,
#            davies.bouldin)
# pheno.test data frame from the Phenograph sweep (same metric columns)
# save_to    output path prefix; if NULL, plots are printed but not saved
compare_cluster_stats <- function(lou.test,
                                  pheno.test,
                                  save_to = NULL) {
  # helper: build the long-format data frame and line plot for one metric
  metric_plot <- function(lou_vals, pheno_vals, title, ylab) {
    d <- data.frame(
      x = c(lou.test$kn, pheno.test$kn),
      value = c(lou_vals, pheno_vals),
      # label each louvain row by its resolution; phenograph has none
      variable = c(sub("^", "louvain resolution=", lou.test$resolution),
                   rep("phenograph", nrow(pheno.test))))
    ggplot(data = d, aes(x = x, y = value)) +
      geom_line(aes(colour = variable)) +
      labs(title = title,
           x = "kn", y = ylab,
           color = "Clustering methods")
  }
  p1 <- metric_plot(lou.test$number.cluster, pheno.test$number.cluster,
                    "Number of Clusters of Different Clustering Methods",
                    "number of clusters")
  p2 <- metric_plot(lou.test$silhouette.score, pheno.test$silhouette.score,
                    "Silhouette Score of Different Clustering Methods",
                    "silhouette score")
  p3 <- metric_plot(lou.test$calinski.harabasz, pheno.test$calinski.harabasz,
                    "Calinski Harabasz Index of Different Clustering Methods",
                    "calinski harabasz index")
  # NOTE(review): this expects a 'davies.bouldin' column, while stats_plot
  # uses 'db' -- confirm which column name the upstream sweep produces
  p4 <- metric_plot(lou.test$davies.bouldin, pheno.test$davies.bouldin,
                    "Davies Bouldin Index of Different Clustering Methods",
                    "Davies Bouldin index")
  print(p1)
  print(p2)
  print(p3)
  print(p4)
  if (!is.null(save_to)) {
    # helper: write one plot to a png under the save_to prefix
    save_png <- function(p, fname) {
      png(filename = paste0(save_to, fname),
          width = 1600, height = 1600, res = 300)
      print(p)
      dev.off()
    }
    save_png(p1, "number_cluster_comparison.png")
    save_png(p2, "silhouette_score_comparison.png")
    save_png(p3, "calinski_harabasz_comparison.png")
    save_png(p4, "davies_bouldin_comparison.png")
  }
}
########## cluster function
# choose cluster conditions to make the seurat object
# method can be "louvain", "phenograph", "flowsom"
# Cluster a Seurat object with a chosen algorithm.
# NOTE(review): the body is an empty stub -- as written it performs no
# clustering and returns NULL invisibly; the implementation appears to be
# missing or truncated in this copy of the file.
get_clusters <- function(seu, method = "louvain", # "louvain", "phenograph" or "flowsom"
df_input = NULL, #needed if input "flowsom"
k = 60, #k for flowsom or kn for Phenograph and Seurat Louvain
resolution = 0.8, #Louvain resolution
plots = TRUE, #presumably: print plots when TRUE -- verify once body exists
save_plots = FALSE) { #presumably: save plots when TRUE -- verify once body exists
# TODO: function body is empty; no clustering is performed
}
p1 <- FeaturePlot(input, features = rownames(input),
slot = 'scale.data',
min.cutoff = 'q1',
max.cutoff ='99',
label.size = 1)+
theme(plot.title = element_text(size = 0.1))
Error in is(x, "classRepresentation") : object 'input' not found
print(p1)
Error in h(simpleError(msg, call)) :
error in evaluating the argument 'x' in selecting a method for function 'print': object 'p1' not found
# try plot function for phenograph outputs
stats_plot(stats_ls = pheno.test,
save_to = output_path,
clust_method = "phenograph")
Error in complete.cases(stats_ls) : invalid 'type' (unknown) of argument
Test functions Louvain
output_path <- "/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/testingLibrary/exp_clusters/seu/"
lou.test <- explore_param(input = seu, #for phenograph and louvain only
cluster_method = "louvain", #take 1 cluster method
df_input = NULL, #needed if input "flowsom"
flow_k = NULL, #k for flowsom
pheno_lou_kn = c(40,100), #kn for phenograph or louvain
lou_resolution = c(0.25,0.5,1), #resolution for louvain
run.plot = TRUE, #print if run
run.stats = TRUE, #print and return if run
save_to = NULL)
Computing nearest neighbor graph
Computing SNN
15:53:19 UMAP embedding parameters a = 0.9922 b = 1.112
15:53:19 Read 73578 rows and found 12 numeric columns
15:53:19 Using Annoy for neighbor search, n_neighbors = 40
15:53:19 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
15:53:24 Writing NN index file to temp file /var/folders/k4/khtkczkd5tn732ftjpwgtr240000gn/T//Rtmp2693yU/file1b1d6ea92ebe
15:53:24 Searching Annoy index using 1 thread, search_k = 4000
15:53:53 Annoy recall = 100%
15:53:55 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 40
15:53:59 Initializing from normalized Laplacian + noise (using irlba)
15:54:08 Commencing optimization for 200 epochs, with 4194988 positive edges
Using method 'umap'
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
15:54:59 Optimization finished
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 4682994
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8983
Number of communities: 13
Elapsed time: 46 seconds
Error in dim(x) <- c(length(x), 1) : attempt to set an attribute on NULL
Check cluster stability - best resolution/cluster number/k value by calculating RAND Index and STD of cluster number
RI <- Rand_index(input = seu,
resolutions = c(0.1,0.25,0.5,0.8,1),
kn = 60,
n = 10, #number of iterations
save_to = NULL #if null, will not save seul, ril, ncl, rdf
)
Computing nearest neighbor graph
Computing SNN
10:40:20 UMAP embedding parameters a = 0.9922 b = 1.112
10:40:20 Read 73578 rows and found 12 numeric columns
10:40:20 Using Annoy for neighbor search, n_neighbors = 60
10:40:20 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
10:40:25 Writing NN index file to temp file /var/folders/k4/khtkczkd5tn732ftjpwgtr240000gn/T//Rtmp2693yU/file1b1d2532f990
10:40:26 Searching Annoy index using 1 thread, search_k = 6000
10:41:05 Annoy recall = 100%
10:41:07 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 60
10:41:14 Initializing from normalized Laplacian + noise (using irlba)
10:41:27 Commencing optimization for 200 epochs, with 6191984 positive edges
Using method 'umap'
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
10:42:25 Optimization finished
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9302
Number of communities: 8
Elapsed time: 53 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
******************************Warning in doTryCatch(return(expr), name, parentenv, handler) :
restarting interrupted promise evaluation
********************|
Maximum modularity in 10 random starts: 0.9303
Number of communities: 8
Elapsed time: 52 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9302
Number of communities: 7
Elapsed time: 48 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9301
Number of communities: 7
Elapsed time: 57 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9303
Number of communities: 8
Elapsed time: 60 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9302
Number of communities: 8
Elapsed time: 51 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9303
Number of communities: 8
Elapsed time: 53 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9303
Number of communities: 8
Elapsed time: 51 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9303
Number of communities: 9
Elapsed time: 57 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9302
Number of communities: 8
Elapsed time: 52 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9010
Number of communities: 14
Elapsed time: 61 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9007
Number of communities: 14
Elapsed time: 54 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9005
Number of communities: 14
Elapsed time: 52 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8998
Number of communities: 14
Elapsed time: 62 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8996
Number of communities: 14
Elapsed time: 54 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9006
Number of communities: 14
Elapsed time: 60 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9008
Number of communities: 14
Elapsed time: 52 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9004
Number of communities: 14
Elapsed time: 51 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9005
Number of communities: 14
Elapsed time: 56 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9006
Number of communities: 14
Elapsed time: 49 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8692
Number of communities: 18
Elapsed time: 276 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8703
Number of communities: 18
Elapsed time: 54 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8697
Number of communities: 18
Elapsed time: 54 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8697
Number of communities: 18
Elapsed time: 60 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8698
Number of communities: 18
Elapsed time: 53 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8695
Number of communities: 18
Elapsed time: 56 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8693
Number of communities: 18
Elapsed time: 54 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8694
Number of communities: 18
Elapsed time: 55 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8689
Number of communities: 18
Elapsed time: 53 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8695
Number of communities: 18
Elapsed time: 53 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8456
Number of communities: 21
Elapsed time: 56 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8457
Number of communities: 22
Elapsed time: 50 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8455
Number of communities: 22
Elapsed time: 61 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8459
Number of communities: 21
Elapsed time: 53 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8454
Number of communities: 23
Elapsed time: 59 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8456
Number of communities: 22
Elapsed time: 64 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8454
Number of communities: 23
Elapsed time: 57 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8456
Number of communities: 23
Elapsed time: 55 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8454
Number of communities: 21
Elapsed time: 54 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8451
Number of communities: 22
Elapsed time: 61 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8323
Number of communities: 25
Elapsed time: 51 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8325
Number of communities: 25
Elapsed time: 59 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8326
Number of communities: 25
Elapsed time: 57 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8325
Number of communities: 25
Elapsed time: 58 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8328
Number of communities: 25
Elapsed time: 62 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8327
Number of communities: 23
Elapsed time: 54 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8324
Number of communities: 25
Elapsed time: 50 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8323
Number of communities: 26
Elapsed time: 54 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8326
Number of communities: 25
Elapsed time: 51 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8324
Number of communities: 25
Elapsed time: 57 seconds
# Look at the Rand Index (RI) results across the tested clustering resolutions
plot_randindex(
rdf = RI$list,  # dataframe of Rand Index results produced by Rand_index()
cp = c("orange", "violet"),  # two colours for the plotted lines
view = c(0, 1) # zoom on the x axis; this does not change the scales, just the viewable section
)
## Test the Rand Index again with more resolutions and fewer repeats - add in the plotting
RI <- Rand_index(input = seu,
resolutions = c(0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.75,1,1.25,1.5),  # Louvain resolutions to compare
kn = 60,  # k nearest neighbours for the SNN graph
n = 2, #number of clustering iterations per resolution
save_to = NULL #if NULL, will not save the seul, ril, ncl, rdf intermediates
)
Computing nearest neighbor graph
Computing SNN
14:38:07 UMAP embedding parameters a = 0.9922 b = 1.112
14:38:07 Read 73578 rows and found 12 numeric columns
14:38:07 Using Annoy for neighbor search, n_neighbors = 60
14:38:08 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
14:38:13 Writing NN index file to temp file /var/folders/k4/khtkczkd5tn732ftjpwgtr240000gn/T//Rtmp2693yU/file1b1d1a7b469a
14:38:13 Searching Annoy index using 1 thread, search_k = 6000
14:38:54 Annoy recall = 100%
14:38:54 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 60
14:39:00 Initializing from normalized Laplacian + noise (using irlba)
14:39:13 Commencing optimization for 200 epochs, with 6191984 positive edges
Using method 'umap'
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
14:40:11 Optimization finished
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9523
Number of communities: 3
Elapsed time: 54 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9523
Number of communities: 3
Elapsed time: 54 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9302
Number of communities: 8
Elapsed time: 51 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9303
Number of communities: 9
Elapsed time: 56 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9069
Number of communities: 13
Elapsed time: 54 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9080
Number of communities: 13
Elapsed time: 53 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8929
Number of communities: 15
Elapsed time: 55 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8934
Number of communities: 14
Elapsed time: 58 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8806
Number of communities: 16
Elapsed time: 53 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8797
Number of communities: 15
Elapsed time: 61 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8698
Number of communities: 18
Elapsed time: 55 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8697
Number of communities: 18
Elapsed time: 58 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8611
Number of communities: 18
Elapsed time: 56 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8605
Number of communities: 18
Elapsed time: 49 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8493
Number of communities: 21
Elapsed time: 53 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8490
Number of communities: 23
Elapsed time: 49 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8329
Number of communities: 24
Elapsed time: 53 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8322
Number of communities: 25
Elapsed time: 52 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8194
Number of communities: 26
Elapsed time: 47 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8193
Number of communities: 27
Elapsed time: 45 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8057
Number of communities: 28
Elapsed time: 47 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8063
Number of communities: 29
Elapsed time: 52 seconds
After checking the parameters, run the cluster function to add the clustering information into the Seurat object.
# Cluster with Seurat Louvain and store the cluster assignments in the object
seu <- get_clusters(seu, method = "louvain",
df_input = NULL, #only needed if method = "flowsom"
k = 60, #k for flowsom or kn (nearest neighbours) for Phenograph and Seurat Louvain
resolution = 0.5,  # chosen after inspecting the Rand Index results above
plots = TRUE,  # also draws a UMAP coloured by cluster
save_plots = FALSE)
[1] "method is Louvain"
Computing nearest neighbor graph
Computing SNN
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8701
Number of communities: 18
Elapsed time: 59 seconds
15:36:17 UMAP embedding parameters a = 0.4502 b = 1.076
15:36:17 Read 73578 rows and found 12 numeric columns
15:36:17 Using Annoy for neighbor search, n_neighbors = 60
15:36:17 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
15:36:22 Writing NN index file to temp file /var/folders/k4/khtkczkd5tn732ftjpwgtr240000gn/T//Rtmp2693yU/file1b1d1358360b
15:36:22 Searching Annoy index using 1 thread, search_k = 6000
15:37:05 Annoy recall = 100%
15:37:06 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 60
15:37:12 Initializing from normalized Laplacian + noise (using irlba)
15:37:25 Commencing optimization for 200 epochs, with 6191984 positive edges
Using method 'umap'
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
15:38:23 Optimization finished
# Cluster the same object with FlowSOM instead of Louvain
seu <- get_clusters(seu, method = "flowsom",
df_input = df.input, #expression dataframe; needed when method = "flowsom"
k = 12, #k (number of metaclusters) for flowsom, or kn for Phenograph and Seurat Louvain
resolution = NULL,  # not used by flowsom
plots = TRUE,
save_plots = FALSE)
[1] "method is flowsom"
Building SOM
Mapping data to SOM
Building MST
14:32:18 UMAP embedding parameters a = 0.4502 b = 1.076
14:32:18 Read 73578 rows and found 12 numeric columns
14:32:18 Using Annoy for neighbor search, n_neighbors = 12
14:32:18 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
14:32:24 Writing NN index file to temp file /var/folders/k4/khtkczkd5tn732ftjpwgtr240000gn/T//Rtmp2693yU/file1b1d74270f0c
14:32:24 Searching Annoy index using 1 thread, search_k = 1200
14:32:36 Annoy recall = 100%
14:32:36 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 12
14:32:38 Initializing from normalized Laplacian + noise (using irlba)
14:32:41 Commencing optimization for 200 epochs, with 1205744 positive edges
Using method 'umap'
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
14:33:13 Optimization finished
# save the Seurat object with the clustering and neighbour graph included
# NOTE(review): hard-coded absolute path - consider building it from output_path
saveRDS(seu,"/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/testingLibrary/seu9000March08.RDS")
Annotate clusters 1. Visualization for manual annotation. - output by clustering function 2. CAM (Correlation assignment model) - requires reference matrix 3. RFM (Random Forest Model) - requires annotated matching flow dataset 4. Seurat label transfer - requires annotated matching flow data in a seurat object
Visualize expression on UMAP and with heat maps
AB <- c("CD24","CD56","CD29","CD15","CD184","CD133","CD71","CD44","GLAST","AQP4","HepaCAM", "CD140a","O4")
# Print one UMAP feature plot per antibody, clipping the colour scale to the
# 1st-97th expression quantiles and overlaying cluster labels.
invisible(lapply(AB, function(marker) {
  print(FeaturePlot(seu, features = marker, min.cutoff = 'q1', max.cutoff = 'q97', label = TRUE))
}))
NA
NA
Some more visualization of expression values
# summary heat map of mean marker expression per cluster
# use function plotmean
# count the clusters found at resolution 0.8
length(unique(seu$RNA_snn_res.0.8))
[1] 22
# there are 22 clusters
cluster.num <- c(0:21)  # cluster labels, used to name the heatmap rows
plotmean(plot_type = 'heatmap',seu = seu, group = 'RNA_snn_res.0.8', markers = AB,
var_names = cluster.num, slot = 'scale.data', xlab = "Cluster",
ylab = "Antibody")
NA
NA
output_path <- "/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/testingLibrary/"
# BUG FIX: the original used paste(output_path, "cluster9000.RDS"), which
# inserts a space between its arguments and wrote the file to
# ".../testingLibrary/ cluster9000.RDS" (leading space in the file name).
# paste0() concatenates without a separator.
saveRDS(seu, paste0(output_path, "cluster9000.RDS"))
Predict cell annotations with CAM (Correlation assignment method)
# reference matrix of expected marker expression for 9 cell types
reference_path <- "/Users/rhalenathomas/GITHUB/CelltypeR/Data/ReferenceMatrix9celltypesOrdered.csv"
# per-cell expression dataframe exported from the clustered object
test_data <- read.csv("/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/testingLibrary/exp_clusters/df9000.csv")
reference_data <- read.csv(reference_path)
# correlate each cell's expression against the reference cell-type profiles
cor <- find_correlation(test_data,
reference_data,
min_corr = 0.5,  # minimum correlation required to assign a label
min_diff = 0.05)  # minimum gap between best and second-best correlation
# creates a dataframe with cor1 cor2 and predicted cell type label
# NOTE(review): `cor` masks base R's cor() function - a different name would be safer
Visualize the CAM results
plot_corr(cor)
Warning in melt(df) :
The melt generic in data.table has been passed a data.frame and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(df). In the next version, this warning will become an error.
Using X, best.cell.type, second.cell.type, cell.label as id variables
Warning in melt(df.downsample) :
The melt generic in data.table has been passed a data.frame and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(df.downsample). In the next version, this warning will become an error.
Using X, best.cell.type, second.cell.type, cell.label as id variables
Warning in melt(double.cells) :
The melt generic in data.table has been passed a data.frame and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(double.cells). In the next version, this warning will become an error.
Using X, best.cell.type, second.cell.type, cell.label as id variables
[[1]]
[[2]]
[[3]]
[[4]]
[[5]]
[[6]]
[[7]]
Warning: Removed 35 rows containing missing values (`geom_line()`).
Warning: Removed 35 rows containing missing values (`geom_point()`).
Apply correlation predictions to clusters and output a vector for annotation functions
unique(seu$cor.labels)
[1] "unknown" "RG" "StemCell"
[4] "Oligo" "Neuron" "StemCell-Neuron"
[7] "OPC" "Endothelial" "Neuron-StemCell"
[10] "Endothelial-RG" "NPC" "Neuron-NPC"
[13] "NPC-StemCell" "RG-Endothelial" "RG-Astrocyte"
[16] "NPC-Neuron" "Astrocyte" "OPC-Neuron"
[19] "StemCell-NPC" "Neuron-OPC" "OPC-Oligo"
[22] "Astrocyte-RG" "Oligo-OPC" "Endothelial-Astrocyte"
[25] "OPC-StemCell" "StemCell-OPC" "Oligo-NPC"
[28] "NPC-OPC" "Oligo-Neuron" "NPC-Oligo"
[31] "OPC-NPC" "Oligo-StemCell" "StemCell-Oligo"
Run the get_annotation function to return a vector of annotations in the order of the clusters.
# one CAM annotation per cluster (matches the 22 clusters at resolution 0.8)
length(cor.ann$CAM)
[1] 22
Use a trained Random Forest model to predict cell types. Training of the Random Forest model with an annotated data set is below.
# you must have a saved trained model from a data object annotated with the same markers
rf <- readRDS("/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/PaperFigures/RFM_trained.11072022.Rds")
# predict a cell type for every cell in the Seurat object
rfm.pred <- RFM_predict(seu, rf)
# add the predictions into the seurat object
# NOTE(review): the prediction column is literally named `predict(rf, df)` by
# RFM_predict - fragile; verify the column name if the package changes
head(rfm.pred)
seu <- AddMetaData(object=seu, metadata=rfm.pred$`predict(rf, df)`, col.name = 'rfm.labels')
# check that the data is added
table(seu$rfm.labels)
Get the annotation by cluster for the RFM
# summarise the RFM per-cell labels into one annotation per cluster
rfm.ann <- get_annotation(seu, seu$RNA_snn_res.0.8,seu$rfm.labels,
top_n = 3, filter_out = c("unknown","Unknown","Mixed","Mix"), Label = "RFM")
rfm.ann
# alternative call keeping all labels (no filtering)
#rfm.ann <- get_annotation(seu, seu$RNA_snn_res.0.8,seu$rfm.labels,
# top_n = 3, filter_out = FALSE, Label = "RFM")
rfm.ann
dim(rfm.ann)
Plot RFM predictions
plot_lab_clust(seu, seu.cluster = seu$RNA_snn_res.0.8, seu.labels = seu$rfm.labels, filter_out = c("unknown","Unknown","Mixed"))
Predicting cell types with Seurat label transfer using anchors
# takes in a seurat object with the labels added
# makes a dataframe with the count of predicted labels for each cluster
# inputs: the seurat object with the predicted labels in the meta data,
# the clusters meta data slot to be labelled, and
# the meta data slot with the labels (correlation, random forest, seurat predicted)
# needs a reference data object with labels
seu.r<- readRDS("/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/PaperFigures/Seu9000annot.08072021.RDS")
# the output is a seurat object with the predicted annotations added (seu$seu.pred)
seu <- seurat_predict(seu, seu.r, ref_id = 'subgroups', down.sample = 500, markers = AB)
# plot the seurat anchor predictions per cluster
plot_lab_clust(seu, seu$RNA_snn_res.0.8, seu$seu.pred)
# get the annotation table for the seurat anchor predictions
# to not filter anything use c()
seu.ann <- get_annotation(seu, seu$RNA_snn_res.0.8,seu$seu.pred,
top_n = 3, filter_out = c(), Label = "Seurat")
seu.ann
Get a consensus of cluster annotations, Add the annotations to the seurat object
# combine the four annotation methods (manual, correlation, RFM, Seurat transfer)
# NOTE(review): man.ann and df.cor.ann are created outside this chunk - confirm they exist
ann.list <- list(man.ann,df.cor.ann,rfm.ann,seu.ann)
# annotate the seurat object with the consensus label per cluster
seu <- cluster_annotate(seu, ann.list,
annotation_name ="CellType",
to_label = "RNA_snn_res.0.8")
DimPlot(seu, group.by = "CellType")
NA
NA
Just use the annotate functions
# apply a single chosen annotation vector (here the manual labels) to the clusters
seu <- annotate(seu, annotations = man.ann$manual, to_label = "RNA_snn_res.0.8", annotation_name = "MyCellType")
DimPlot(seu, group.by = "MyCellType")
#save with the annotations
saveRDS(seu,"/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/testingLibrary/seu9000Feb24.RDS")
Compare groups
seu <- readRDS("/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/testingLibrary/seu9000Feb24.RDS")
Add the variables into the seurat object
# Sample-level experimental variables, one value per sample.
# NOTE(review): the order must match levels(seu) after Idents(seu) <- "Sample"
# (the 9 renamed samples from import) - confirm the level order.
Genotype <- c("3450","3450","3450","AIW002","AIW002","AIW002","AJG001C","AJG001C","AJG001C")
ExDate <- c("0306","0317","0317","0306","0317","0317","0306","0317","0317")
Batch <- c("B","B","A","B","B","A","B","B","A")
Age <- c("273","284","263","273","284","263","273","284","263")

# The original repeated the same Idents/RenameIdents/assign pattern four
# times; loop over a named list instead (metadata slot name -> values).
sample_meta <- list(Genotype = Genotype,
                    ExDate = ExDate,
                    Batch = Batch,
                    DaysinCulture = Age)  # days in final differentiation media
for (slot_name in names(sample_meta)) {
  Idents(seu) <- "Sample"            # set idents to the sample names
  ids <- sample_meta[[slot_name]]    # new values for this variable
  names(ids) <- levels(seu)          # map each sample level to its value
  seu <- RenameIdents(seu, ids)      # relabel idents with the variable values
  seu[[slot_name]] <- Idents(seu)    # store as a new metadata column
}
Plot some variables
# one plot: cell-type proportions split by genotype
# NOTE(review): `seu.q` looks like a typo for `seu` - confirm which object is intended
proportionplots(seu.q,seu.var = seu$Genotype, seu.lable = seu$CellType, groups = "Genotype")
[1] "Number of colours needed17"
[1] "Number of colours entered 1"
[1] "Default ggplot colours used"
# add custom colours - one per cell type (17 needed, 18 supplied here)
colours <- c("chocolate1","chocolate3","orange",
"lightsalmon", "pink","lightpink3",
"steelblue3","deepskyblue",
"plum3","purple",
"seagreen3","tomato4","burlywood3","grey90","brown",
"royalblue3", "tan4","yellowgreen")
proportionplots(seu.q,seu.var = seu$Genotype, seu.lable = seu$CellType, groups = "Genotype", my_colours = colours)
[1] "Number of colours needed17"
[1] "Number of colours entered 18"
[1] "Custome colours used."
# experimental variables to plot, with display names in matching order
var.list <- list(seu$DaysinCulture,seu$Batch,seu$ExDate,seu$Genotype)
varnames <- c("Days in Culture", "Batch", "Experiment Date", "Genotype")
# plot all the variables of interest at once
# NOTE(review): only 2 colours supplied but 17 needed, so ggplot defaults are
# used for every panel (see the messages below)
plotproportions(seu, var.list = var.list, xgroup = seu$CellType, varnames = varnames, my_colours = c("blue","red"))
[1] "Number of colours needed17"
[1] "Number of colours entered 2"
[1] "Default ggplot colours used"
[1] "Number of colours needed17"
[1] "Number of colours entered 2"
[1] "Default ggplot colours used"
[1] "Number of colours needed17"
[1] "Number of colours entered 2"
[1] "Default ggplot colours used"
[1] "Number of colours needed17"
[1] "Number of colours entered 2"
[1] "Default ggplot colours used"
Heatmaps
# make sure the order is correct - these names must match the CellType levels
celltypes <- c("unknown","radial glia 1", "astrocytes 1", "mixed","neurons 1",
"neurons 2", "epithelial", "astrocytes mature", "npc", "radial glia 2",
"radial glia 3", "endothelial","neurons 3", "astrocytes 2",
"oligodendrocytes", "stem-like 1","neural stem")
# heatmap of mean marker expression per annotated cell type
plotmean(plot_type = 'heatmap',seu = seu, group = 'CellType', markers = AB,
var_names = celltypes, slot = 'scale.data', xlab = "Cell Type",
ylab = "Antibody")
NA
NA
# dot plot of mean expression, with cell types reordered alphabetically
# original cell-type order, kept for reference
# NOTE(review): og.order is not used below (new.order is passed instead) - confirm
og.order <- c("unknown","radial glia 1", "astrocytes 1", "mixed","neurons 1",
"neurons 2", "epithelial", "astrocytes mature", "npc", "radial glia 2",
"radial glia 3", "endothelial","neurons 3", "astrocytes 2",
"oligodendrocytes", "stem-like 1","neural stem")
# make sure the terms are exactly the same and you don't miss any
new.order <- c("astrocytes 1","astrocytes 2","astrocytes mature",
"endothelial","epithelial", "mixed","neurons 1",
"neurons 2","neurons 3","neural stem","npc",
"oligodendrocytes",
"radial glia 1","radial glia 2","radial glia 3","stem-like 1",
"unknown")
# reversed so the first cell type appears at the top of the y axis
new.order <- rev(new.order)
# marker order for the x axis of the dot plot
AB.order <- c("CD24","CD56","CD29","CD15","CD184","CD133","CD71","CD44","GLAST","AQP4","HepaCAM", "CD140a","O4")
plotmean(plot_type = 'dotplot',seu = seu, group = 'CellType', markers = AB,
var_names = celltypes, slot = 'scale.data', xlab = "Cell Type",
ylab = "Antibody", var1order = new.order, var2order = AB.order)
NA
NA
Statistics comparing groups
# prepare a dataframe for stats
# this function takes the annotated seurat object with all the variables already existing as metadata slots
# check what meta data slots exist in your object
colnames(seu@meta.data)
[1] "orig.ident" "nCount_RNA" "nFeature_RNA" "Sample"
[5] "RNA_snn_res.0.8" "seurat_clusters" "cor.labels" "rfm.labels"
[9] "seu.pred" "CellType" "MyCellType" "Genotype"
[13] "ExDate" "Batch" "DaysinCulture"
# run the function - variables must be existing metadata column names
var.names <- c("Sample","DaysinCulture", "Batch", "ExDate", "Genotype", "CellType")
df.for.stats <- Prep_for_stats(seu, marker_list = AB, variables = var.names,
marker_name = 'Marker')
# save the csv for later
write.csv(df.for.stats, "/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/testingLibrary/df4statsFeb26.csv")
head(df.for.stats)
NA
NA
First we have 3 variables: experimental variable, cell types, and markers. We want to compare the mean expression per group: for all antibodies together, in each cell type, between a given variable such as Genotype (one-way ANOVA); and for each antibody separately, for a given variable, in each cell type (two-way ANOVA).
# inspect the summary dataframe of group means
# NOTE(review): df_means is not created anywhere in this document - confirm
# where it comes from (presumably a means-computation step that was omitted)
head(df_means)
dim(df_means)
[1] 1898 8
Train the Random Forest model. This requires a labelled Seurat object. More cells will give higher accuracy but increase computation time; run on an HPC when using many cells.
# Train a Random Forest model on the annotated object.
# (argument spellings below - seurate_object, mytry - follow the CelltypeR
# function signature and must not be changed here)
rf <- RFM_train(seurate_object = seu,
AB_list = AB, annotations = seu$CellType3,
split = c(0.8,0.2),  # train/test proportions
downsample = 20000,  # cap on cells used for training
seed = 222,  # for reproducibility
mytry = c(1:10),  # mtry values to tune over
maxnodes = c(12: 25),  # max terminal nodes search range
trees = c(250, 500, 1000,2000),  # forest sizes to try
start_node = 15)
# BUG FIX: the original `save(rf, output_path, "trainedRFMFeb14.Rds")` is
# invalid - save() requires a `file =` argument, and the bare string would be
# interpreted as the name of an object to save (which does not exist), so the
# call errors. Use saveRDS() with a properly built path, matching the .Rds
# extension and the readRDS() call used elsewhere to load the trained model.
saveRDS(rf, paste0(output_path, "trainedRFMFeb14.Rds"))